# install.packages("xfun") # uncoomment if need to install xfun
xfun::pkg_attach2(c("haven","dplyr","ggplot2"))
mydata <- read_dta("Figures02_03.dta") %>% 
  select(collegemore, response) %>% 
  na.omit()

order <- 15
r.square <- matrix(NA,order,1)
adj.r.square <- matrix(NA,order,1)
mse <- matrix(NA,order,1)

set.seed(1952)
mydata.r <- mydata[sample(nrow(mydata)),]
k <- 10
folds <- cut(seq(1,nrow(mydata.r)), breaks=k, labels=FALSE)
r.square = matrix(NA,k,order)

for(i in 1:k){
  #Segement your data by fold using the which() function
  testIndexes <- which(folds==i,arr.ind=TRUE)
  testData <- mydata.r[testIndexes, ]
  trainData <- mydata.r[-testIndexes, ]
  #Use the test and train data partitions
  
  for (j in 1:order){
    fit.train = lm(collegemore ~ poly(response,j), data = trainData)
    fit.test = predict(fit.train, newdata=testData)
    r.square[i,j] = cor(fit.test, testData$collegemore, use='complete')^2
  } 
}

# Examine Results
colMeans(r.square)
r.square

# which order has highest r-square?
which(colMeans(r.square)==max(colMeans(r.square)))

mydata <- mydata %>% 
  mutate(title = "(1) Education Attainment in Polls")

myplots <- list(`Unit Response Rate` = ggplot(mydata, aes(x = response, y = collegemore)) +
                  geom_point() +
                  stat_smooth(method='lm', formula = y ~ poly(x,3), size = 1, color = "black", fill = "white") +
                  scale_x_reverse() +
                  theme_bw() +
                  theme(panel.grid = element_blank(), panel.background = element_rect(fill = "darkgray")) +
                  ylab('Proportion with Higher Education') + 
                  xlab('Unit Response Rate') +
                  facet_wrap(~ title))

# install.packages("xfun") # uncoomment if need to install xfun
xfun::pkg_attach2(c("haven","dplyr","ggplot2"))
mydata <- read_dta("Figures02_03.dta") %>% 
  select(survey, year, collegemore) %>% 
  na.omit() %>% 
  group_by(survey, year) %>% 
  summarise(collegemore = mean(collegemore)) %>% 
  group_by(year) %>% 
  summarise(collegemore = mean(collegemore)) %>% 
  mutate(source = "Polls") 

census <- readxl::read_xlsx("Figure02EducationAttainment.xlsx") %>% 
  select(year = 1, collegemore = 2) %>% 
  mutate(source = "Census",
         collegemore = collegemore / 100) %>% 
  filter(year %in% mydata$year)

mydata <- mydata %>% 
  bind_rows(., census) %>% 
  mutate(title = "(2) Education Attainment in Census and Polls",
         Source = factor(source,
                         levels = c("Polls", "Census")))

myplots[["Time"]] <- ggplot(mydata, aes(x = year, y = collegemore, color = Source)) +
  geom_point() +
  geom_line() +
  scale_color_grey(end = 1) +
  theme_bw() +
  theme(panel.grid = element_blank(), legend.key = element_rect(fill = "darkgray", color = "black"),
        panel.background = element_rect(fill = "darkgray"),
        legend.position = "bottom") +
  ylab('Proportion with Higher Education') + 
  xlab('Year') +
  facet_wrap(~ title)

pdf("Figure02.pdf", width = 8, height = 4)
patchwork::wrap_plots(myplots, guides = "collect") & theme(legend.position = 'bottom')
dev.off()